obs_csv <- file.path(dir_data, "obs.csv")
obs_geo <- file.path(dir_data, "obs.geojson")
redo <- F
# get species occurrence data from GBIF with coordinates
if(!file.exists(obs_geo) | redo){
(res <- spocc::occ(
query = 'Phoenicopterus ruber',
from = 'gbif',
has_coords = T,
limit = 10000
))
df <- res$gbif$data[[1]]
nrow(df) # number of rows
readr::write_csv(df, obs_csv)
obs <- df %>%
sf::st_as_sf(
coords = c("longitude", "latitude"),
crs = st_crs(4326)) %>%
select(prov, key) # save space (joinable from obs_csv)
sf::write_sf(obs, obs_geo, delete_dsn=T)
}
obs <- sf::read_sf(obs_geo)
nrow(obs)
## [1] 10000
# show points on map
mapview::mapview(obs, map.types = "OpenTopoMap")
dir_env <- file.path(dir_data, "env")
# set a default data directory
options(sdmpredictors_datadir = dir_env)
# choosing marine
env_datasets <- sdmpredictors::list_datasets(terrestrial = T, marine = F)
# show table of datasets
env_datasets %>%
select(dataset_code, description, citation) %>%
DT::datatable()
# choose datasets for a vector
env_datasets_vec <- c("WorldClim", "ENVIREM")
# get layers
env_layers <- sdmpredictors::list_layers(env_datasets_vec)
DT::datatable(env_layers)
# choose layers after some inspection and perhaps consulting literature
env_layers_vec <- c("WC_alt", "WC_bio1", "WC_bio2", "ER_tri", "ER_topoWet")
# get layers
env_stack <- load_layers(env_layers_vec)
# interactive plot layers, hiding all but first (select others)
plot(env_stack, nc=2)
obs_hull_geo <- file.path(dir_data, "obs_hull.geojson")
env_stack_grd <- file.path(dir_data, "env_stack.grd")
if (!file.exists(obs_hull_geo) | redo){
# make convex hull around points of observation
obs_hull <- sf::st_convex_hull(st_union(obs))
# save obs hull
write_sf(obs_hull, obs_hull_geo)
}
obs_hull <- read_sf(obs_hull_geo)
# show points on map
mapview(
list(obs, obs_hull))
if (!file.exists(env_stack_grd) | redo){
obs_hull_sp <- sf::as_Spatial(obs_hull)
env_stack <- raster::mask(env_stack, obs_hull_sp) %>%
raster::crop(extent(obs_hull_sp))
writeRaster(env_stack, env_stack_grd, overwrite=T)
}
env_stack <- stack(env_stack_grd)
# show map
# mapview(obs) +
# mapview(env_stack, hide = T) # makes html too big for Github
plot(env_stack, nc=2)
absence_geo <- file.path(dir_data, "absence.geojson")
pts_geo <- file.path(dir_data, "pts.geojson")
pts_env_csv <- file.path(dir_data, "pts_env.csv")
if (!file.exists(absence_geo) | redo){
# get raster count of observations
r_obs <- rasterize(
sf::as_Spatial(obs), env_stack[[1]], field=1, fun='count')
# show map
# mapview(obs) +
# mapview(r_obs)
# create mask for
r_mask <- mask(env_stack[[1]] > -Inf, r_obs, inverse=T)
# generate random points inside mask
absence <- dismo::randomPoints(r_mask, nrow(obs)) %>%
as_tibble() %>%
st_as_sf(coords = c("x", "y"), crs = 4326)
write_sf(absence, absence_geo, delete_dsn=T)
}
absence <- read_sf(absence_geo)
# show map of presence, ie obs, and absence
mapview(obs, col.regions = "green") +
mapview(absence, col.regions = "gray")
## Warning in cbind(`Feature ID` = fid, mat): number of rows of result is not a
## multiple of vector length (arg 1)
if (!file.exists(pts_env_csv) | redo){
# combine presence and absence into single set of labeled points
pts <- rbind(
obs %>%
mutate(
present = 1) %>%
select(present, key),
absence %>%
mutate(
present = 0,
key = NA)) %>%
mutate(
ID = 1:n()) %>%
relocate(ID)
write_sf(pts, pts_geo, delete_dsn=T)
# extract raster values for points
pts_env <- raster::extract(env_stack, as_Spatial(pts), df=TRUE) %>%
tibble() %>%
# join present and geometry columns to raster value results for points
left_join(
pts %>%
select(ID, present),
by = "ID") %>%
relocate(present, .after = ID) %>%
# extract lon, lat as single columns
mutate(
#present = factor(present),
lon = st_coordinates(geometry)[,1],
lat = st_coordinates(geometry)[,2]) %>%
select(-geometry)
write_csv(pts_env, pts_env_csv)
}
pts_env <- read_csv(pts_env_csv)
pts_env %>%
# show first 10 presence, last 10 absence
slice(c(1:10, (nrow(pts_env)-9):nrow(pts_env))) %>%
DT::datatable(
rownames = F,
options = list(
dom = "t",
pageLength = 20))
pts_env %>%
select(-ID) %>%
mutate(
present = factor(present)) %>%
pivot_longer(-present) %>%
ggplot() +
geom_density(aes(x = value, fill = present)) +
scale_fill_manual(values = alpha(c("gray", "green"), 0.5)) +
scale_x_continuous(expand=c(0,0)) +
scale_y_continuous(expand=c(0,0)) +
theme_bw() +
facet_wrap(~name, scales = "free") +
theme(
legend.position = c(1, 0),
legend.justification = c(1, 0))
## Warning: Removed 1044 rows containing non-finite values (stat_density).
librarian::shelf(
DT, dplyr, dismo, GGally, here, readr, tidyr)
##
## The 'cran_repo' argument in shelf() was not set, so it will use
## cran_repo = 'https://cran.r-project.org' by default.
##
## To avoid this message, set the 'cran_repo' argument to a CRAN
## mirror URL (see https://cran.r-project.org/mirrors.html) or set
## 'quiet = TRUE'.
select <- dplyr::select # overwrite raster::select
options(readr.show_col_types = F)
dir_data <- here("data/sdm")
pts_env_csv <- file.path(dir_data, "pts_env.csv")
pts_env <- read_csv(pts_env_csv)
nrow(pts_env)
## [1] 20000
datatable(pts_env, rownames = F)
GGally::ggpairs(
select(pts_env, -ID),
aes(color = factor(present)))
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 197 rows containing missing values
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 197 rows containing missing values
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 197 rows containing missing values
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 301 rows containing missing values
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 152 rows containing missing values
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning: Removed 197 rows containing missing values (geom_point).
## Warning: Removed 197 rows containing non-finite values (stat_density).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 197 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 197 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 332 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 201 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 197 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 197 rows containing missing values
## Warning: Removed 197 rows containing missing values (geom_point).
## Warning: Removed 197 rows containing missing values (geom_point).
## Warning: Removed 197 rows containing non-finite values (stat_density).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 197 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 332 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 201 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 197 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 197 rows containing missing values
## Warning: Removed 197 rows containing missing values (geom_point).
## Warning: Removed 197 rows containing missing values (geom_point).
## Warning: Removed 197 rows containing missing values (geom_point).
## Warning: Removed 197 rows containing non-finite values (stat_density).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 332 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 201 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 197 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 197 rows containing missing values
## Warning: Removed 301 rows containing missing values (geom_point).
## Warning: Removed 332 rows containing missing values (geom_point).
## Warning: Removed 332 rows containing missing values (geom_point).
## Warning: Removed 332 rows containing missing values (geom_point).
## Warning: Removed 301 rows containing non-finite values (stat_density).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 303 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 301 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 301 rows containing missing values
## Warning: Removed 152 rows containing missing values (geom_point).
## Warning: Removed 201 rows containing missing values (geom_point).
## Warning: Removed 201 rows containing missing values (geom_point).
## Warning: Removed 201 rows containing missing values (geom_point).
## Warning: Removed 303 rows containing missing values (geom_point).
## Warning: Removed 152 rows containing non-finite values (stat_density).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 152 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 152 rows containing missing values
## Warning: Removed 197 rows containing missing values (geom_point).
## Warning: Removed 197 rows containing missing values (geom_point).
## Warning: Removed 197 rows containing missing values (geom_point).
## Warning: Removed 301 rows containing missing values (geom_point).
## Warning: Removed 152 rows containing missing values (geom_point).
## Warning: Removed 197 rows containing missing values (geom_point).
## Warning: Removed 197 rows containing missing values (geom_point).
## Warning: Removed 197 rows containing missing values (geom_point).
## Warning: Removed 301 rows containing missing values (geom_point).
## Warning: Removed 152 rows containing missing values (geom_point).
# setup model data
d <- pts_env %>%
select(-ID) %>% # remove terms we don't want to model
tidyr::drop_na() # drop the rows with NA values
nrow(d)
## [1] 19668
# fit a linear model
mdl <- lm(present ~ ., data = d)
summary(mdl)
##
## Call:
## lm(formula = present ~ ., data = d)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.07599 -0.31329 0.07745 0.34631 1.44589
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.570404107 0.046298365 33.919 <0.0000000000000002 ***
## WC_alt -0.000077068 0.000007708 -9.998 <0.0000000000000002 ***
## WC_bio1 -0.039247738 0.000720435 -54.478 <0.0000000000000002 ***
## WC_bio2 -0.051579810 0.001299683 -39.686 <0.0000000000000002 ***
## ER_tri -0.004271455 0.000224818 -19.000 <0.0000000000000002 ***
## ER_topoWet 0.033495629 0.003747848 8.937 <0.0000000000000002 ***
## lon -0.004265912 0.000081128 -52.583 <0.0000000000000002 ***
## lat -0.008736233 0.000171005 -51.087 <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3898 on 19660 degrees of freedom
## Multiple R-squared: 0.3923, Adjusted R-squared: 0.3921
## F-statistic: 1813 on 7 and 19660 DF, p-value: < 0.00000000000000022
y_predict <- predict(mdl, pts_env, type="response")
y_true <- pts_env$present
range(y_predict)
## [1] NA NA
range(y_true)
## [1] 0 1
# fit a generalized linear model with a binomial logit link function
mdl <- glm(present ~ ., family = binomial(link="logit"), data = d)
summary(mdl)
##
## Call:
## glm(formula = present ~ ., family = binomial(link = "logit"),
## data = d)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.8932 -0.6316 -0.0778 0.8326 3.7967
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 8.95095496 0.37591201 23.811 < 0.0000000000000002 ***
## WC_alt -0.00024986 0.00005422 -4.608 0.000004056799 ***
## WC_bio1 -0.30131529 0.00640733 -47.027 < 0.0000000000000002 ***
## WC_bio2 -0.41142538 0.01062817 -38.711 < 0.0000000000000002 ***
## ER_tri -0.03652296 0.00218994 -16.678 < 0.0000000000000002 ***
## ER_topoWet 0.18605164 0.02954437 6.297 0.000000000303 ***
## lon -0.03068770 0.00070107 -43.773 < 0.0000000000000002 ***
## lat -0.06701779 0.00148996 -44.980 < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 27261 on 19667 degrees of freedom
## Residual deviance: 17253 on 19660 degrees of freedom
## AIC: 17269
##
## Number of Fisher Scoring iterations: 6
y_predict <- predict(mdl, d, type="response")
range(y_predict)
## [1] 0.0001601382 0.9910803961
# show term plots
termplot(mdl, partial.resid = TRUE, se = TRUE, main = F)
librarian::shelf(mgcv)
##
## The 'cran_repo' argument in shelf() was not set, so it will use
## cran_repo = 'https://cran.r-project.org' by default.
##
## To avoid this message, set the 'cran_repo' argument to a CRAN
## mirror URL (see https://cran.r-project.org/mirrors.html) or set
## 'quiet = TRUE'.
# fit a generalized additive model with smooth predictors
mdl <- mgcv::gam(
formula = present ~ s(WC_alt) + s(WC_bio1) +
s(WC_bio2) + s(ER_tri) + s(ER_topoWet) + s(lon) + s(lat),
family = binomial, data = d)
summary(mdl)
##
## Family: binomial
## Link function: logit
##
## Formula:
## present ~ s(WC_alt) + s(WC_bio1) + s(WC_bio2) + s(ER_tri) + s(ER_topoWet) +
## s(lon) + s(lat)
##
## Parametric coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.7728 0.2715 -10.21 <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df Chi.sq p-value
## s(WC_alt) 8.870 8.983 692.12 <0.0000000000000002 ***
## s(WC_bio1) 7.632 8.178 463.75 <0.0000000000000002 ***
## s(WC_bio2) 8.917 8.997 366.79 <0.0000000000000002 ***
## s(ER_tri) 7.447 7.756 83.36 <0.0000000000000002 ***
## s(ER_topoWet) 7.237 8.130 55.26 <0.0000000000000002 ***
## s(lon) 8.997 9.000 632.41 <0.0000000000000002 ***
## s(lat) 8.878 8.989 1022.83 <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.868 Deviance explained = 81.9%
## UBRE = -0.74255 Scale est. = 1 n = 19668
# show term plots
plot(mdl, scale=0)
Maxent is probably the most commonly used species distrbution model since it performs well with few input data points, only reuires presence points and is easy to use with a Java graphical user interface (GUI).
# load extra packages
librarian::shelf(
maptools, sf)
##
## The 'cran_repo' argument in shelf() was not set, so it will use
## cran_repo = 'https://cran.r-project.org' by default.
##
## To avoid this message, set the 'cran_repo' argument to a CRAN
## mirror URL (see https://cran.r-project.org/mirrors.html) or set
## 'quiet = TRUE'.
# show version of maxent
maxent()
## This is MaxEnt version 3.4.3
# get environmental rasters
# NOTE: the first part of Lab 1. SDM - Explore got updated to write this clipped environmental raster stack
env_stack_grd <- file.path(dir_data, "env_stack.grd")
env_stack <- stack(env_stack_grd)
plot(env_stack, nc=2)
# get presence-only observation points (maxent extracts raster values for you)
obs_geo <- file.path(dir_data, "obs.geojson")
obs_sp <- read_sf(obs_geo) %>%
sf::as_Spatial() # maxent prefers sp::SpatialPoints over newer sf::sf class
# fit a maximum entropy model
mdl <- maxent(env_stack, obs_sp)
## Warning in .local(x, p, ...): 72 (7.42%) of the presence points have NA
## predictor values
## This is MaxEnt version 3.4.3
plot(mdl)
# plot term plots
response(mdl)
# predict
y_predict <- predict(env_stack, mdl) #, ext=ext, progress='')
plot(y_predict, main='Maxent, raw prediction')
data(wrld_simpl, package="maptools")
plot(wrld_simpl, add=TRUE, border='dark grey')
# read data
pts_env <- read_csv(pts_env_csv)
d <- pts_env %>%
select(-ID) %>% # not used as a predictor x
mutate(
present = factor(present)) %>% # categorical response
na.omit() # drop rows with NA
skim(d)
| Name | d |
| Number of rows | 19668 |
| Number of columns | 8 |
| _______________________ | |
| Column type frequency: | |
| factor | 1 |
| numeric | 7 |
| ________________________ | |
| Group variables | None |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| present | 0 | 1 | FALSE | 2 | 0: 9979, 1: 9689 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| WC_alt | 0 | 1 | 414.38 | 545.31 | -38.00 | 13.00 | 143.50 | 612.25 | 3995.00 | ▇▂▁▁▁ |
| WC_bio1 | 0 | 1 | 22.47 | 4.62 | -0.90 | 18.40 | 24.10 | 26.30 | 30.60 | ▁▁▃▅▇ |
| WC_bio2 | 0 | 1 | 11.51 | 3.24 | 4.90 | 9.20 | 11.30 | 14.20 | 20.40 | ▃▇▇▆▁ |
| ER_tri | 0 | 1 | 14.38 | 21.94 | 0.00 | 2.74 | 7.42 | 15.56 | 295.78 | ▇▁▁▁▁ |
| ER_topoWet | 0 | 1 | 12.24 | 1.41 | 7.00 | 11.36 | 12.41 | 13.34 | 15.80 | ▁▂▆▇▁ |
| lon | 0 | 1 | -27.00 | 49.06 | -117.14 | -76.29 | -7.71 | 19.12 | 40.04 | ▃▆▁▂▇ |
| lat | 0 | 1 | 3.90 | 22.04 | -34.71 | -13.96 | 10.60 | 21.32 | 52.05 | ▆▃▆▇▁ |
# create training set with 80% of full data
d_split <- rsample::initial_split(d, prop = 0.8, strata = "present")
d_train <- rsample::training(d_split)
# show number of rows present is 0 vs 1
table(d$present)
##
## 0 1
## 9979 9689
table(d_train$present)
##
## 0 1
## 7983 7751
# run decision stump model
mdl <- rpart(
present ~ ., data = d_train,
control = list(
cp = 0, minbucket = 5, maxdepth = 1))
mdl
## n= 15734
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 15734 7751 0 (0.50737257 0.49262743)
## 2) WC_alt>=47.5 9320 1866 0 (0.79978541 0.20021459) *
## 3) WC_alt< 47.5 6414 529 1 (0.08247583 0.91752417) *
# plot tree
par(mar = c(1, 1, 1, 1))
rpart.plot(mdl)
# decision tree with defaults
mdl <- rpart(present ~ ., data = d_train)
mdl
## n= 15734
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 15734 7751 0 (0.50737257 0.49262743)
## 2) WC_alt>=47.5 9320 1866 0 (0.79978541 0.20021459)
## 4) lat>=-24.99979 7538 353 0 (0.95317060 0.04682940) *
## 5) lat< -24.99979 1782 269 1 (0.15095398 0.84904602) *
## 3) WC_alt< 47.5 6414 529 1 (0.08247583 0.91752417)
## 6) lat>=30.20225 131 19 0 (0.85496183 0.14503817) *
## 7) lat< 30.20225 6283 417 1 (0.06636957 0.93363043) *
rpart.plot(mdl)
# plot complexity parameter
plotcp(mdl)
# rpart cross validation results
mdl$cptable
## CP nsplit rel error xerror xstd
## 1 0.69100761 0 1.0000000 1.0000000 0.008090673
## 2 0.16049542 1 0.3089924 0.3089924 0.005813492
## 3 0.01199845 2 0.1484970 0.1492711 0.004223995
## 4 0.01000000 3 0.1364985 0.1372726 0.004063578
# caret cross validation results
mdl_caret <- train(
present ~ .,
data = d_train,
method = "rpart",
trControl = trainControl(method = "cv", number = 10),
tuneLength = 20)
ggplot(mdl_caret)
vip(mdl_caret, num_features = 40, bar = FALSE)
# Construct partial dependence plots
p1 <- partial(mdl_caret, pred.var = "lat") %>% autoplot()
p2 <- partial(mdl_caret, pred.var = "WC_bio2") %>% autoplot()
p3 <- partial(mdl_caret, pred.var = c("lat", "WC_bio2")) %>%
plotPartial(levelplot = FALSE, zlab = "yhat", drape = TRUE,
colorkey = TRUE, screen = list(z = -20, x = -60))
# Display plots side by side
gridExtra::grid.arrange(p1, p2, p3, ncol = 3)
## Warning: Use of `object[[1L]]` is discouraged. Use `.data[[1L]]` instead.
## Warning: Use of `object[["yhat"]]` is discouraged. Use `.data[["yhat"]]`
## instead.
## Warning: Use of `object[[1L]]` is discouraged. Use `.data[[1L]]` instead.
## Warning: Use of `object[["yhat"]]` is discouraged. Use `.data[["yhat"]]`
## instead.
# number of features
n_features <- length(setdiff(names(d_train), "present"))
# fit a default random forest model
mdl_rf <- ranger(present ~ ., data = d_train)
# get out of the box RMSE
(default_rmse <- sqrt(mdl_rf$prediction.error))
## [1] 0.1090188
# re-run model with impurity-based variable importance
mdl_impurity <- ranger(
present ~ ., data = d_train,
importance = "impurity")
# re-run model with permutation-based variable importance
mdl_permutation <- ranger(
present ~ ., data = d_train,
importance = "permutation")
p1 <- vip::vip(mdl_impurity, bar = FALSE)
p2 <- vip::vip(mdl_permutation, bar = FALSE)
gridExtra::grid.arrange(p1, p2, nrow = 1)